Mark-Recapture Workshop 2024

Bayesian Modeling with Stan using CmdStanR

Dr. Matthijs Hollanders

History of Bayesian PPLs

  • MCMC only became feasible with modern computers
  • Bayesian Inference Using Gibbs Sampling (BUGS)
    • BUGS, WinBUGS, OpenBUGS, MultiBUGS
  • JAGS: Just Another Gibbs Sampler
  • NIMBLE: a BUGS-like modeling language embedded in R, with models compiled to C++
  • Stan (Carpenter et al. 2017) (named after Stanisław Ulam)
    • State-of-the-art MCMC algorithm: No U-Turn Sampler (NUTS)
    • Gradient-based vs. random walk
    • World class developer team
    • Lots of auxiliary packages for model checking, plotting, goodness-of-fit, etc.

Introducing Stan

  • “Stan is a state-of-the-art platform for statistical modeling and high-performance statistical computation.”
  • Interfaces with R, Python, etc.
  • Specify a log probability density function in a Stan program and fit models to data
    • Logs make everything easier (multiplication becomes addition, etc.)
    • Hamiltonian Monte Carlo (No U-Turn Sampler [NUTS])
    • Variational inference (approximate Bayesian inference)
    • Optimisation (maximum likelihood)
  • We will be using CmdStan via the cmdstanr package (Gabry et al. 2024) in R

Stan programs

Model of the mean

\[ \begin{aligned} y_i &\sim \mathrm{Normal}(\mu, \sigma), \quad i \in 1:I \\ \mu &\sim p(.) \\ \sigma &\sim p(.) \\ \end{aligned} \]

We want to know \(p(\mu, \sigma \mid y)\)

# generate 100 observations from Normal(mu, sigma)
I <- 100  # number of observations
mu <- 3.7  # true mean
sigma <- 1.3  # true SD
y <- rnorm(I, mu, sigma)

# empirical mean and SD of the simulated data (printed below)
mean(y) ; sd(y)
[1] 3.649769
[1] 1.350531

Static typing: forces intent and catches errors sooner

functions {
  // user-defined functions go here
}
data {
  int<lower=0> I;  // number of observations
  vector[I] y;  // outcome vector
}
transformed data {
  // computed once before sampling; print() appears at chain start-up
  real mean_y = mean(y);
  real sd_y = sd(y);
  print("mean_y: ", mean_y, ", ", "sd_y: ", sd_y);
}
parameters {
  real mu;  // mean
  real<lower=0> sigma;  // SD (constrained positive)
}
transformed parameters {
  // deterministic transformations of parameters would go here
}
model {
  // priors
  mu ~ normal(0, 5);
  sigma ~ exponential(1);
  // likelihood
  y ~ normal(mu, sigma);
}
generated quantities {
  // differential entropy of a Normal distribution with scale sigma
  real entropy = 0.5 * log(2 * pi() * e() * square(sigma));
}

Fit models using cmdstanr

# load cmdstanr and set number of cores so chains run in parallel
library(cmdstanr)
options(mc.cores = 4)

# compile model
# mean_mod <- cmdstan_model(stan_file = "Stan/mean.stan")

# Stan data: list names must match the Stan program's data block
mean_dat <- list(I = I, y = y)

# fit the model with NUTS; save warmup draws so traceplots can display them
mean_fit <- mean_mod$sample(data = mean_dat,
                            iter_warmup = 200, iter_sampling = 200,
                            save_warmup = TRUE, refresh = 0)
Running MCMC with 4 parallel chains...

Chain 1 mean_y: 3.64977, sd_y: 1.35053 
Chain 2 mean_y: 3.64977, sd_y: 1.35053 
Chain 3 mean_y: 3.64977, sd_y: 1.35053 
Chain 4 mean_y: 3.64977, sd_y: 1.35053 
Chain 1 finished in 0.0 seconds.
Chain 2 finished in 0.0 seconds.
Chain 3 finished in 0.0 seconds.
Chain 4 finished in 0.0 seconds.

All 4 chains finished successfully.
Mean chain execution time: 0.0 seconds.
Total execution time: 0.2 seconds.
  • mod$sample() runs Stan’s HMC and yields a CmdStanMCMC object
  • CmdStanMCMC objects have associated methods
    • fit$draws() returns posterior draws for all quantities defined in parameters, transformed parameters, and generated quantities
    • fit$summary() summarises these quantities (means, uncertainty, diagnostics, etc.)
  • Check MCMC convergence (\(\hat{R} \approx 1\)) and diagnostics, especially divergent transitions

Traceplots and parameter summaries

# traceplots, including warmup draws to show chain adaptation
pacman::p_load(bayesplot)
mean_fit$draws(c("mu", "sigma"), inc_warmup = TRUE) |>
  mcmc_trace(n_warmup = 200, facet_args = list(ncol = 1))

# summary: medians, 90% intervals, and convergence diagnostics
mean_fit$summary() |>
  select(variable, median, contains("q"), ess_bulk, rhat)
# A tibble: 4 × 6
  variable median     q5    q95 ess_bulk  rhat
  <chr>     <dbl>  <dbl>  <dbl>    <dbl> <dbl>
1 lp__     -81.6  -84.0  -80.9      497.  1.00
2 mu         3.65   3.42   3.85     636.  1.01
3 sigma      1.35   1.19   1.54     620.  1.00
4 entropy    1.72   1.60   1.85     620.  1.00

Plot posterior summaries

# posterior intervals with the simulated true values overlaid in red
truth_tbl <- tibble(param = c("mu", "sigma"),
                    value = c(mu, sigma))
mean_fit$draws(c("mu", "sigma")) |>
  mcmc_intervals() +
  geom_point(aes(value, param),
             data = truth_tbl,
             position = position_nudge(y = 0.1),
             colour = "red4")

Revisiting priors

# overlay priors (grey) and posteriors (red) for mu and sigma
pacman::p_load(posterior, tidybayes, distributional, ggdist)
mean_fit |> 
  gather_rvars(mu, sigma) |> 
  # attach the matching prior distribution to each parameter
  mutate(.prior = c(dist_normal(0, 5), 
                    dist_exponential(1))) |> 
  ggplot() + 
  facet_wrap(~ .variable, 
             ncol = 1, 
             scales = "free") + 
  stat_slab(aes(xdist = .prior), 
            alpha = 2/3) + 
  stat_slab(aes(xdist = .value), 
            alpha = 2/3, 
            fill = "red4") + 
  ggeasy::easy_remove_y_axis() + 
  coord_cartesian(expand = FALSE) +  # was `expand = F`; always spell out FALSE
  labs(x = "Prior and posterior distributions")

Two means and their difference

t-test

\[ \begin{aligned} y_i &\sim \mathrm{Normal}(\mu_{[g_i]}, \sigma_{[g_i]}), \quad i \in 1:I, \ g \in 1:2 \\ \mu_1, \mu_2 &\sim p(.) \\ \sigma_1, \sigma_2 &\sim p(.) \\ \end{aligned} \]

# generate 100 draws from two groups and plot
mu[2] <- 2.2          # second group mean; mu becomes a length-2 vector
sigma[2] <- sigma[1]  # equal SDs in both groups
g <- sample(1:2, I, replace = TRUE)  # was `replace = T`; always spell out TRUE
y <- rnorm(I, mu[g], sigma[g])
tibble(y = y, g = g) |> ggplot(aes(y, factor(g))) + geom_dotplot(dotsize = 1/2) + labs(y = "group")

data {
  int<lower=0> I;  // number of observations
  vector[I] y;  // outcome vector
  array[I] int<lower=1, upper=2> g;  // group identifier
}
parameters {
  vector[2] mu;  // means (one per group)
  vector<lower=0>[2] sigma;  // SDs (one per group)
}
model {
  // priors (applied elementwise to both groups)
  mu ~ normal(0, 5);
  sigma ~ exponential(1);
  
  // likelihood: g indexes each observation into its group's parameters
  y ~ normal(mu[g], sigma[g]);
}
generated quantities {
  // difference in means and SDs, computed per posterior draw
  real delta_mu = mu[2] - mu[1];
  real delta_sigma = sigma[2] - sigma[1];
}

Fit the model and assess the difference in groups

# compile model
# t_mod <- cmdstan_model(stan_file = "Stan/t.stan")

# assemble data and fit the model with NUTS
t_dat <- list(I = I, y = y, g = g)
t_fit <- t_mod$sample(data = t_dat, refresh = 0)
Running MCMC with 4 parallel chains...
Chain 1 finished in 0.0 seconds.
Chain 2 finished in 0.0 seconds.
Chain 3 finished in 0.1 seconds.
Chain 4 finished in 0.1 seconds.

All 4 chains finished successfully.
Mean chain execution time: 0.0 seconds.
Total execution time: 0.2 seconds.
# check the posterior differences in means and SDs, per chain
mcmc_dens_chains(t_fit$draws(c("delta_mu", "delta_sigma")))

Another example

Linear regression

\[ \begin{aligned} y_i &\sim \mathrm{Normal}(\mu_i, \sigma), \quad i \in 1:I \\ \mu_i &= \alpha + \beta \cdot x_i \\ \alpha, \beta, \sigma &\sim p(.) \end{aligned} \]

# simulate and plot data (same RNG call order as before: x first, then y)
x <- rnorm(I)
alpha <- -1
beta <- 0.5
sigma <- 0.8
y <- rnorm(I, alpha + beta * x, sigma)
lm_tbl <- tibble(x = x, y = y)
ggplot(lm_tbl, aes(x, y)) + geom_point()

Stan program

Introducing log_lik and yrep

data {
  int<lower=0> I;  // number of observations
  vector[I] y, x;  // outcome vector and covariate
}
parameters {
  real alpha, beta;  // intercept and slope
  real<lower=0> sigma;  // residual SD
}
model {
  // priors
  alpha ~ normal(0, 5);
  beta ~ normal(0, 1);
  sigma ~ exponential(1);
  
  // likelihood (vectorised over all I observations)
  // vector[I] mu = alpha + beta * x;
  y ~ normal(alpha + beta * x, sigma);
  // target += normal_lupdf(y | alpha + beta * x, sigma);
}
generated quantities {
  // pointwise log-likelihood values (needed for PSIS-LOO)
  vector[I] log_lik;
  for (i in 1:I) {
    log_lik[i] = normal_lpdf(y[i] | alpha + beta * x[i], sigma);
  }
  
  // posterior predictive distribution: one simulated dataset per draw
  array[I] real yrep = normal_rng(alpha + beta * x, sigma);
}

Fit model in Stan

# compile model
# lm_mod <- cmdstan_model(stan_file = "Stan/lm.stan")

# Stan data: glimpse() prints the structure while passing the list through
lm_dat <- list(I = I, y = y, x = x) |> glimpse()
List of 3
 $ I: num 100
 $ y: num [1:100] 0.891 -2.458 -2.36 -2.023 -0.888 ...
 $ x: num [1:100] 2.194 -2.367 0.726 -0.466 0.293 ...
# fit the model with NUTS (cmdstanr sampler defaults)
lm_fit <- lm_mod$sample(data = lm_dat, refresh = 0)
Running MCMC with 4 parallel chains...

Chain 1 finished in 0.1 seconds.
Chain 2 finished in 0.1 seconds.
Chain 3 finished in 0.1 seconds.
Chain 4 finished in 0.1 seconds.

All 4 chains finished successfully.
Mean chain execution time: 0.1 seconds.
Total execution time: 0.2 seconds.

Parameter summaries

# check output: compare posterior summaries against the simulation truth
lm_fit$summary(c("alpha", "beta", "sigma")) |>
  mutate(truth = c(alpha, beta, sigma)) |>
  select(!c(sd, mad))
# A tibble: 3 × 9
  variable   mean median     q5    q95  rhat ess_bulk ess_tail truth
  <chr>     <dbl>  <dbl>  <dbl>  <dbl> <dbl>    <dbl>    <dbl> <dbl>
1 alpha    -1.07  -1.07  -1.19  -0.949  1.00    3541.    2424.  -1  
2 beta      0.576  0.576  0.445  0.706  1.00    3753.    2620.   0.5
3 sigma     0.740  0.737  0.653  0.835  1.00    3619.    3086.   0.8

Traceplots

# check traceplots for the three model parameters
mcmc_trace(lm_fit$draws(c("alpha", "beta", "sigma")))

Making predictions

pacman::p_load(tidybayes, ggdist)

# prediction grid spanning the observed covariate range
x_pred <- seq(min(x), max(x), length.out = 200)
# one row per posterior draw of (alpha, beta, sigma)
lm_fit |> 
  spread_draws(alpha, beta, sigma)
# A tibble: 4,000 × 6
   .chain .iteration .draw  alpha  beta sigma
    <int>      <int> <int>  <dbl> <dbl> <dbl>
 1      1          1     1 -1.02  0.608 0.755
 2      1          2     2 -1.13  0.533 0.747
 3      1          3     3 -1.07  0.546 0.683
 4      1          4     4 -1.20  0.557 0.777
 5      1          5     5 -1.14  0.629 0.692
 6      1          6     6 -0.985 0.523 0.762
 7      1          7     7 -0.936 0.505 0.727
 8      1          8     8 -0.946 0.518 0.721
 9      1          9     9 -0.991 0.547 0.742
10      1         10    10 -1.03  0.575 0.746
# ℹ 3,990 more rows

Making predictions

pacman::p_load(tidybayes, ggdist)

x_pred <- seq(min(x), max(x), length.out = 200)
# crossing() repeats every posterior draw for every prediction point
lm_fit |> 
  spread_draws(alpha, beta, sigma) |> 
  crossing(x = x_pred)
# A tibble: 800,000 × 7
   .chain .iteration .draw alpha  beta sigma     x
    <int>      <int> <int> <dbl> <dbl> <dbl> <dbl>
 1      1          1     1 -1.02 0.608 0.755 -2.37
 2      1          1     1 -1.02 0.608 0.755 -2.34
 3      1          1     1 -1.02 0.608 0.755 -2.32
 4      1          1     1 -1.02 0.608 0.755 -2.30
 5      1          1     1 -1.02 0.608 0.755 -2.28
 6      1          1     1 -1.02 0.608 0.755 -2.25
 7      1          1     1 -1.02 0.608 0.755 -2.23
 8      1          1     1 -1.02 0.608 0.755 -2.21
 9      1          1     1 -1.02 0.608 0.755 -2.18
10      1          1     1 -1.02 0.608 0.755 -2.16
# ℹ 799,990 more rows

4000 draws \(\times\) 200 input values = 800,000 rows

Making predictions

pacman::p_load(tidybayes, ggdist)

x_pred <- seq(min(x), max(x), length.out = 200)
lm_fit |> 
  spread_draws(alpha, beta, sigma) |> 
  crossing(x = x_pred) |>
  # posterior of the regression line evaluated at each x
  mutate(mu = alpha + beta * x) |> 
  ggplot(aes(x, mu)) + 
  # 90% credible ribbon around the posterior median line
  stat_lineribbon(.width = 0.9, 
                  fill = "blue4", 
                  alpha = 1/2) + 
  geom_point(aes(y = y), 
             data = lm_tbl, 
             alpha = 2/3)

Goodness-of-fit

  • Two main ways of checking fit: cross-validation (CV) and posterior predictive checking (PPC)
  • CV: how well does the model predict unseen or new data?
  • PPC: how well do our model-based predictions match our observed data?
  • Intuition: if our model fits well, then it should generate predictions that look similar to our observed data
# overlay kernel density of observed y with 50 posterior predictive draws
yrep_mat <- lm_fit$draws("yrep", format = "draws_matrix")[1:50, ]
pp_check(object = lm_dat$y,
         yrep = yrep_mat,
         fun = ppc_dens_overlay)

Important

Prior predictive checks simulate potential data from the joint prior distribution. Posterior predictive checks simulate potential data from the joint posterior distribution after learning parameters from data.

Leave-one-out CV

  • Leave-one-out CV: LOO-CV
  • Predictive performance of the model by leaving one observation out at a time
  • Pareto-smoothed importance sampling LOO-CV: PSIS-LOO
  • Requires posterior draws of the observation-level log_lik
# PSIS-LOO computed from the stored pointwise log_lik draws
lm_loo <- lm_fit$loo()
lm_loo

Computed from 4000 by 100 log-likelihood matrix.

         Estimate   SE
elpd_loo   -112.8  6.8
p_loo         2.9  0.5
looic       225.6 13.5
------
MCSE of elpd_loo is 0.0.
MCSE and ESS estimates assume MCMC draws (r_eff in [0.7, 1.0]).

All Pareto k estimates are good (k < 0.7).
See help('pareto-k-diagnostic') for details.
# plot Pareto k diagnostics, one point per observation
plot(lm_loo)

One more example

Logistic ANCOVA?

\[ \begin{aligned} y_i &\sim \mathrm{Bernoulli}(\psi_i), \quad i \in 1:I, \ g \in 1:G \\ \psi_i &= \operatorname{logit}^{-1} (\alpha_{[g_i]} + \beta \cdot x_i) \\ \alpha_{g}, \beta &\sim p(.) \end{aligned} \]

# simulate binary outcomes from a logistic model with group intercepts
G <- 4  # number of groups
g <- sample(1:G, I, replace = TRUE)  # was `replace = T`; always spell out TRUE
x <- rnorm(I)
alpha <- rnorm(G)  # group-level intercepts on the logit scale
beta <- 0.5
inv_logit <- function(p) 1 / (1 + exp(-p))  # same mapping as stats::plogis()
psi <- inv_logit(alpha[g] + beta * x)  # success probability per observation
y <- rbinom(I, 1, psi)
# plot the binary outcome against x, one panel per group
tibble(g = g, x = x, y = y) |> 
  ggplot(aes(x, y)) + 
  facet_wrap(~ g, labeller = label_both) + 
  geom_point(alpha = 1/2) + 
  scale_y_continuous(breaks = seq(0, 1, 0.5))

Stan program

data {
  int<lower=1> I, G;  // number of observations and groups
  array[I] int<lower=0, upper=1> y;  // binary outcome array
  vector[I] x;  // continuous covariate
  array[I] int<lower=1, upper=G> g;  // group identifier
}
parameters {
  vector[G] alpha;  // group-level intercepts (logit scale)
  real beta;  // slope (logit scale)
}
model {
  // priors; logistic(0, 1) on alpha is uniform on the probability scale
  alpha ~ logistic(0, 1);
  beta ~ normal(0, 1);
  
  // likelihood; bernoulli_logit applies inverse-logit internally
  y ~ bernoulli_logit(alpha[g] + beta * x);
  // y ~ bernoulli(inv_logit(alpha[g] + beta * x));
}
generated quantities {
  // pointwise log-likelihood (for PSIS-LOO)
  vector[I] log_lik;
  for (i in 1:I) {
    log_lik[i] = bernoulli_logit_lpmf(y[i] | alpha[g[i]] + beta * x[i]);
  }
  // posterior predictive draws of the binary outcome
  array[I] int yrep = bernoulli_logit_rng(alpha[g] + beta * x);
}

Fit the model in Stan

# compile model
# glm_mod <- cmdstan_model(stan_file = "Stan/glm.stan")

# Stan data: glimpse() prints the structure while passing the list through
glm_dat <- list(I = I, G = G, y = y, x = x, g = g) |> glimpse()
List of 5
 $ I: num 100
 $ G: num 4
 $ y: int [1:100] 0 1 1 0 1 0 0 1 1 0 ...
 $ x: num [1:100] 0.252 -1.797 0.225 0.322 1.324 ...
 $ g: int [1:100] 3 1 1 2 1 4 3 2 4 2 ...
# fit with NUTS (cmdstanr sampler defaults)
glm_fit <- glm_mod$sample(data = glm_dat, refresh = 0)
Running MCMC with 4 parallel chains...

Chain 1 finished in 0.1 seconds.
Chain 2 finished in 0.1 seconds.
Chain 3 finished in 0.1 seconds.
Chain 4 finished in 0.1 seconds.

All 4 chains finished successfully.
Mean chain execution time: 0.1 seconds.
Total execution time: 0.2 seconds.
# estimates: posterior intervals with the simulated truths overlaid in red
glm_truth <- tibble(param = c(str_c("alpha[", 1:G, "]"), "beta"),
                    value = c(alpha, beta))
glm_fit$draws(c("alpha", "beta")) |>
  mcmc_intervals() +
  geom_point(aes(value, param),
             data = glm_truth,
             position = position_nudge(y = 0.1),
             colour = "red4")

Goodness-of-fit

PSIS-LOO

glm_loo <- glm_fit$loo()  # PSIS-LOO from pointwise log_lik
plot(glm_loo)  # Pareto k diagnostics per observation

PPC

# grouped bar PPC: observed binary counts vs 500 posterior predictive draws
yrep_glm <- glm_fit$draws("yrep", format = "draws_matrix")[1:500, ]
pp_check(object = glm_dat$y,
         yrep = yrep_glm,
         group = glm_dat$g,
         fun = ppc_bars_grouped)


References

Carpenter, Bob, Andrew Gelman, Matthew D. Hoffman, Daniel Lee, Ben Goodrich, Michael Betancourt, Marcus A. Brubaker, Jiqiang Guo, Peter Li, and Allen Riddell. 2017. “Stan: A Probabilistic Programming Language.” Journal of Statistical Software 76: 1. https://doi.org/10.18637/jss.v076.i01.
Gabry, Jonah, Rok Češnovar, Andrew Johnson, and Steve Bronder. 2024. “cmdstanr: R Interface to ’CmdStan’.”